Introduction

The aim is to combine label transfer and CNV inference to annotate Wilms tumor samples in SCPCP000006. The proposed annotation will be based on the combination of:

  • the label transfer from the fetal kidney reference (Stewart et al.), in particular the fetal_kidney_predicted.compartment and fetal_kidney:predicted.cell_type, as well as the prediction.score for each compartment,

  • the predicted CNV calculated using intra-sample endothelial and immune cells (--reference both) as normal reference; no reference was used for samples with fewer than 3 predicted normal cells.

In a second time, we will explore and validate the chosen annotation.

We will use some of the markers genes to validate visually the annotations.

The analysis can be summarized as the following:

Where cnv.thr and pred.thr need to be discussed

first level annotation second level annotation selection of the cells marker genes for validation CNV validation
normal endothelial compartment == "endothelium" & predicted.score > pred.thr VWF no CNV
normal immune compartment == "immune" & predicted.score > pred.thr PTPRC, CD163, CD68 no CNV
normal kidney compartment == "fetal_nephron" & predicted.score > pred.thr & cnv_score < cnv.thr CDH1, PODXL, LTL no CNV
normal stroma compartment == "stroma" & predicted.score > pred.thr & cnv_score < cnv.thr VIM no CNV
cancer stroma compartment == "stroma" & cnv_score > cnv.thr VIM proportion_cnv_chr: 1, 4, 11, 16, 17, 18
cancer blastema compartment == "fetal_nephron" & cell_type == "mesenchymal cell" & cnv_score > cnv.thr CITED1 proportion_cnv_chr: 1, 4, 11, 16, 17, 18
cancer epithelial compartment == "fetal_nephron" & cell_type != "mesenchymal cell" & cnv_score > cnv.thr CDH1 proportion_cnv_chr: 1, 4, 11, 16, 17, 18
unknown - the rest of the cells - -

Packages

library(Seurat)
library(tidyverse)
library(patchwork)
library(DT)

Base directories

# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)

# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", "SCPCP000006")

# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")

result_dir <- file.path(module_base, "results")

Output files

The report will be saved in the notebook directory. The final annotation file SCPCP000006-annotations.tsv will be saved in the results directory.

# cell type annotation table
annotations_tsv <- file.path(result_dir, "SCPCP000006-annotations.tsv")

Input files

In this notebook, we are working with all of the samples in SCPCP000006.

The sample metadata can be found in sample_metadata_file in the data folder.

We extracted from the pre-processed and labeled Seurat object from results/{sample_id}/06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds. the following information per cell:

  • the predicted compartment in fetal_kidney_predicted.compartment and fetal_kidney_predicted.compartment.score

  • the predicted compartment in fetal_kidney_predicted.cell_type and fetal_kidney_predicted.cell_type.score

  • the sample identifier in sample_id

  • the CNV {dupli|loss} estimation per chromosome {i} and arm {p|q}: has_{dupli|loss}_chr{i}{p|q}, in 1 to 22

  • the counts of markers genes

We will also use:

  • the samples metadata or clinical data reported in /home/rstudio/data/current/SCPCP000006/single_cell_metadata.tsv

  • the cytoband file to get the length of each chromosome arm (for plotting only)

# sample metadata
sample_metadata_file <- file.path(repository_base, "data", "current", "SCPCP000006", "single_cell_metadata.tsv")
metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE)

# the cytoband file that will be used to extract the size of each chromosome arm for plotting only
arm_order_file <- file.path(module_base, "results", "references", "hg38_cytoBand.txt")
# Extract from each Seurat object information that will be used
sample_ids <- metadata |>
  dplyr::filter(seq_unit != "spot") |>
  dplyr::pull(scpca_sample_id) |>
  unique()

# These samples were run with "none" as the reference
none_reference_samples <- c("SCPCS000177", "SCPCS000180", "SCPCS000181", "SCPCS000190", "SCPCS000197")

# Create a data frames of all annotations
cell_type_df <- sample_ids |>
  purrr::map(
    # For each sample_id, do the following:
    \(sample_id) {
      if (sample_id %in% none_reference_samples) {
        reference <- "none"
      } else {
        reference <- "both"
      }

      input_file <- file.path(
        result_dir,
        sample_id,
        glue::glue("06_infercnv_HMM-i3_{sample_id}_reference-{reference}.rds")
      )

      # The file may not be present if this is being run in CI, which is ok.
      # If we are not running in CI and the file doesn't exist, we should error out
      # We should error out if the file does not exist and we are NOT testing
      if (!file.exists(input_file)) {
        if (params$testing) {
          return(NULL)
        } else {
          stop("Input RDS file does not exist.")
        }
      }

      # Read in the Seurat object
      srat <- readRDS(input_file)

      # Create and return a data frame from the Seurat object with relevant annotations
      # this data frame will have four columns: barcode, sample_id, compartment, organ
      data.frame(
        # label transfer from the fetal kidney reference
        cell_type = srat$fetal_kidney_predicted.cell_type,
        compartment = srat$fetal_kidney_predicted.compartment,

        # predicted.scores from the label transfer from the fetal kidney reference
        cell_type.score = srat$fetal_kidney_predicted.cell_type.score,
        compartment.score = srat$fetal_kidney_predicted.compartment.score,

        # cell embedding
        umap = srat@reductions$umap@cell.embeddings,

        # marker genes
        PTPRC = FetchData(object = srat, vars = "ENSG00000081237", layer = "counts"),
        VWF = FetchData(object = srat, vars = "ENSG00000110799", layer = "counts"),
        VIM = FetchData(object = srat, vars = "ENSG00000026025", layer = "counts"),
        CITED1 = FetchData(object = srat, vars = "ENSG00000125931", layer = "counts"),
        CDH1 = FetchData(object = srat, vars = "ENSG00000039068", layer = "counts"),
        PODXL = FetchData(object = srat, vars = "ENSG00000128567", layer = "counts"),
        COL6A3 = FetchData(object = srat, vars = "ENSG00000163359", layer = "counts"),
        SIX2 = FetchData(object = srat, vars = "ENSG00000170577", layer = "counts"),
        NCAM1 = FetchData(object = srat, vars = "ENSG00000149294", layer = "counts"),
        THY1 = FetchData(object = srat, vars = "ENSG00000154096", layer = "counts"),

        # loss global estimation per chromosome
        has_loss_chr1p = srat$has_loss_chr1p,
        has_loss_chr2p = srat$has_loss_chr2p,
        has_loss_chr3p = srat$has_loss_chr3p,
        has_loss_chr4p = srat$has_loss_chr4p,
        has_loss_chr5p = srat$has_loss_chr5p,
        has_loss_chr6p = srat$has_loss_chr6p,
        has_loss_chr7p = srat$has_loss_chr7p,
        has_loss_chr8p = srat$has_loss_chr8p,
        has_loss_chr9p = srat$has_loss_chr9p,
        has_loss_chr10p = srat$has_loss_chr10p,
        has_loss_chr11p = srat$has_loss_chr11p,
        has_loss_chr12p = srat$has_loss_chr12p,
       
        has_loss_chr16p = srat$has_loss_chr16p,
        has_loss_chr17p = srat$has_loss_chr17p,
        has_loss_chr18p = srat$has_loss_chr18p,
        has_loss_chr19p = srat$has_loss_chr19p,
        has_loss_chr20p = srat$has_loss_chr20p,
        
        
        has_loss_chr1q = srat$has_loss_chr1q,
        has_loss_chr2q = srat$has_loss_chr2q,
        has_loss_chr3q = srat$has_loss_chr3q,
        has_loss_chr4q = srat$has_loss_chr4q,
        has_loss_chr5q = srat$has_loss_chr5q,
        has_loss_chr6q = srat$has_loss_chr6q,
        has_loss_chr7q = srat$has_loss_chr7q,
        has_loss_chr8q = srat$has_loss_chr8q,
        has_loss_chr9q = srat$has_loss_chr9q,
        has_loss_chr10q = srat$has_loss_chr10q,
        has_loss_chr11q = srat$has_loss_chr11q,
        has_loss_chr12q = srat$has_loss_chr12q,
        has_loss_chr13q = srat$has_loss_chr13q,
        has_loss_chr14q = srat$has_loss_chr14q,
        has_loss_chr15q = srat$has_loss_chr15q,
        has_loss_chr16q = srat$has_loss_chr16q,
        has_loss_chr17q = srat$has_loss_chr17q,
        has_loss_chr18q = srat$has_loss_chr18q,
        has_loss_chr19q = srat$has_loss_chr19q,
        has_loss_chr20q = srat$has_loss_chr20q,
        has_loss_chr21q = srat$has_loss_chr21q,
        has_loss_chr22q = srat$has_loss_chr22q,
        
        # dupli global estimation per chromosome
        has_dupli_chr1p = srat$has_dupli_chr1p,
        has_dupli_chr2p = srat$has_dupli_chr2p,
        has_dupli_chr3p = srat$has_dupli_chr3p,
        has_dupli_chr4p = srat$has_dupli_chr4p,
        has_dupli_chr5p = srat$has_dupli_chr5p,
        has_dupli_chr6p = srat$has_dupli_chr6p,
        has_dupli_chr7p = srat$has_dupli_chr7p,
        has_dupli_chr8p = srat$has_dupli_chr8p,
        has_dupli_chr9p = srat$has_dupli_chr9p,
        has_dupli_chr10p = srat$has_dupli_chr10p,
        has_dupli_chr11p = srat$has_dupli_chr11p,
        has_dupli_chr12p = srat$has_dupli_chr12p,
       
        has_dupli_chr16p = srat$has_dupli_chr16p,
        has_dupli_chr17p = srat$has_dupli_chr17p,
        has_dupli_chr18p = srat$has_dupli_chr18p,
        has_dupli_chr19p = srat$has_dupli_chr19p,
        has_dupli_chr20p = srat$has_dupli_chr20p,
        
        
        has_dupli_chr1q = srat$has_dupli_chr1q,
        has_dupli_chr2q = srat$has_dupli_chr2q,
        has_dupli_chr3q = srat$has_dupli_chr3q,
        has_dupli_chr4q = srat$has_dupli_chr4q,
        has_dupli_chr5q = srat$has_dupli_chr5q,
        has_dupli_chr6q = srat$has_dupli_chr6q,
        has_dupli_chr7q = srat$has_dupli_chr7q,
        has_dupli_chr8q = srat$has_dupli_chr8q,
        has_dupli_chr9q = srat$has_dupli_chr9q,
        has_dupli_chr10q = srat$has_dupli_chr10q,
        has_dupli_chr11q = srat$has_dupli_chr11q,
        has_dupli_chr12q = srat$has_dupli_chr12q,
        has_dupli_chr13q = srat$has_dupli_chr13q,
        has_dupli_chr14q = srat$has_dupli_chr14q,
        has_dupli_chr15q = srat$has_dupli_chr15q,
        has_dupli_chr16q = srat$has_dupli_chr16q,
        has_dupli_chr17q = srat$has_dupli_chr17q,
        has_dupli_chr18q = srat$has_dupli_chr18q,
        has_dupli_chr19q = srat$has_dupli_chr19q,
        has_dupli_chr20q = srat$has_dupli_chr20q,
        has_dupli_chr21q = srat$has_dupli_chr21q,
        has_dupli_chr22q = srat$has_dupli_chr22q
      ) |>
        tibble::rownames_to_column("barcode") |>
        dplyr::mutate(sample_id = sample_id)
    }
  ) |>
  # now combine all dataframes to make one big one
  dplyr::bind_rows()
# Add clinical metadata
cell_type_df <- cell_type_df |>
  left_join( metadata, by= c("sample_id" = "scpca_sample_id"))

Functions

do_Feature_mean

do_Feature_mean shows heatmap of mean expression of a feature grouped by a metadata.

  • df is the name of the table containing metadata and feature expression (counts) per cells
  • group.by is the name of the metadata to group the cells
  • feature is the name of the gene to average the expression
do_Feature_mean <- function(df, group.by, feature) {
  df <- df |>
    group_by(sample_id, !!sym(group.by)) |>
    summarise(m = mean(!!sym(feature)))


  p <- ggplot(df, aes(x = sample_id, y = !!sym(group.by), fill = m)) +
    geom_tile() +
    scale_fill_viridis_c() +
    theme_bw() +
    theme(text = element_text(size = 20)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5)) +
    guides(fill = guide_colourbar(title = paste0(feature)))

  return(p)
}

do_BoxPlot_mean

do_BoxPlot_mean shows boxplot of the percentage of cells with CNV in a specific chromosome arm per clinical relevant group (metadata)

  • df is the name of the table containing metadata and feature expression (counts) per cells
  • group.by is the name of the metadata to group the cells
  • comparison is a list of factors in group.by for which we will compare the means using an unpaired Wilcoxon test
do_BoxPlot_mean <- function(df, group.by, comparison){
  tmp <- df |>
    select(!!sym(group.by), sample_id, contains("has_loss_chr"), contains("has_dupli_chr")) |>
    group_by(!!sym(group.by), sample_id) |>
    summarise_all(.funs = mean) |>
    pivot_longer(cols = -c(sample_id,!!sym(group.by)) ) |>
    rename("chromosome_arm" = "name", "percentage" = "value") |>
    mutate(chromosome_arm = factor(chromosome_arm, levels = c(has_dupli_chr, has_loss_chr)))
  
  
  p <- ggplot(tmp, aes(x = !!sym(group.by), y = percentage, fill = !!sym(group.by))) +
  geom_boxplot()+
  scale_fill_brewer(palette="PuRd") +
  geom_dotplot(binaxis='y', stackdir='center') +
  facet_wrap(~chromosome_arm, nrow = 5 ) +
  theme_classic()  

  return(p)
}

Analysis

Global CNV score

As done in 06_cnv_infercnv_exploration.Rmd, we calculate single CNV score and assess its potential in identifying cells with CNV versus normal cells without CNV.

We simply checked for each chromosome if the cell has_cnv_chr. Would the cell have less than 0 chromosome with CNV, the global has_cnv_score will be FALSE. Would the cell have more than 2 chromosome with CNV, the global has_cnv_score will be TRUE. The default is set to unknown.

The infercnv method can lead to false positive CNV. We introduced a flexibility of +2 over the cnv_threshold to reduce the risk of misclassification of normal cells as cancer cells.

Please note that some cancer cell might not have any CNV. There is thus a risk of misclassifying cancer cells as normal cells. However, we cannot avoid this risk with the data and tools currently available. This is a limitation of our annotations.

cell_type_df <- cell_type_df |>
  mutate(has_cnv_score = rowSums(cell_type_df[, grepl("has_loss_chr|has_dupli_chr", colnames(cell_type_df))])) |>
  mutate(has_cnv_score = case_when(
    has_cnv_score > params$cnv_threshold_high ~ "CNV",
    has_cnv_score <= params$cnv_threshold_low ~ "no CNV",
    .default = "unknown"
  ))

First level annotation

At first, we like to indicate in the first.level_annotation if a cell is normal, cancer or unknown.

  • normal cells can be observe in all five compartments (normal,endothelium, immune, stroma or fetal nephron) and do not have CNV. We only allow a bit of flexibility in terms of CNV profile for immune and endothelium cells that have a high predicted score. Indeed, we know that false positive CNV can be observed in a cell type specific manner. Please note that in 06_infercnv.R, we renamed all immune and endothelium cells with a fetal_kidney_predicted.compartment.score > 0.75 as normal.

The threshold used for the predicted.score is defined as a parameter of this notebook as 0.75. The thresholds used for the identification of CNV are also defined as notebook parameters with a minimum of 0 and a maximum of 2.

  • cancer cells are either from the stroma or fetal nephron compartments and must have at least few CNV
# Define normal cells
# We first pick up the immune and endothelial cells annotated via the label transfer compartments under the condition that the predicted score is above the threshold
cell_type_df <- cell_type_df |>
  mutate(first.level_annotation = case_when(
    # assign normal/cancer based on condition
    compartment %in% c("fetal_nephron", "stroma") & 
      has_cnv_score == "no CNV" ~ "normal",
    
    compartment %in% c("normal", "immune", "endothelium") &
      cell_type.score > params$predicted.celltype.threshold ~ "normal",
    
    compartment %in% c("fetal_nephron", "stroma") & 
      has_cnv_score == "CNV" ~ "cancer",
    
    .default = "unknown"
  ))

Using this basic strategy, we identified 156207 cancer cells, 12777 normal cells. 31553cells remain unknown

ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = first.level_annotation), shape = 19, size = 1) +
  geom_point() +
  facet_wrap(facets = ~sample_id, ncol = 5) +
  theme_bw() +
  theme(text = element_text(size = 22))

DT::datatable(table(cell_type_df$first.level_annotation, cell_type_df$sample_id))

Strikingly, 5 samples show more normal cells than cancer cells:

  • SCPCS000177
  • SCPCS000180
  • SCPCS000181
  • SCPCS000190
  • SCPCS000197

These five samples do not have enough immune and endothelium cell to be used as a normal reference in scripts/06_infercnv.R and we previously decided to run scripts/06_infercnv.R without any reference for these samples. In that case, the mean expression profile over the sample is taken as the normal reference, biaising the inference of CNV.

For those samples, the annotations based on infercnv cannot be trust. To avoid any confusion in the following steps of the analysis, we decided to force the annotation of the entire samples to unknown.

cell_type_df$first.level_annotation[cell_type_df$sample_id %in% c("SCPCS000177", "SCPCS000180", "SCPCS000181", "SCPCS000190", "SCPCS000197")] <- "unknown"

Second level annotation

Normal cells

  • Normal cells from the fetal nephron compartment must be normal kidney cells.

  • Normal cells from the stroma compartment must be normal stroma cells.

  • Immune and endothelial cells have been already identified by label transfer.

Cancer cells

Wilms tumor cancer cells can be:

  • cancer stroma: We define as cancer stroma all cancer cells from the stroma compartment.

  • blastema,: we defined as blastema every cancer cell that has a fetal_kidney_predicted.cell_type == mesenchymal cell. We know that these mesenchymal cells are cells from the cap mesenchyme that are not expected to be in a mature kidney. These blastema cells should express higher CITED1 and/or NCAM1.

  • cancer epithelium: we defined as cancer epithelium all cancer cells that are neither stroma nor blastemal cells. We expect these cells to express epithelial markers. Their predicted cell type should correspond to more mature kidney epithelial subunits.

cell_type_df <- cell_type_df |>
  mutate(second.level_annotation = case_when(
    # assign normal cells based on condition
    cell_type_df$compartment %in% c("fetal_nephron") &
      cell_type_df$has_cnv_score == "no CNV" ~ "kidney",
   
    cell_type_df$compartment %in% c("stroma") &
      cell_type_df$has_cnv_score == "no CNV" ~ "normal stroma",
    
    cell_type_df$compartment %in% c("normal") &
      cell_type_df$cell_type %in% c("endothelial cell") ~ "endothelium",
    cell_type_df$compartment %in% c("normal") &
      cell_type_df$cell_type %in% c("B cell",
                                    "CD4-positive, alpha-beta T cell",
                                    "conventional dendritic cell",
                                    "lymphocyte",
                                    "macrophage",
                                    "mast cell",
                                    "monocyte",
                                    "natural killer cell",
                                    "neutrophil",
                                    "plasmacytoid dendritic cell") ~ "immune",

    # assign cancer cells based on condition
    cell_type_df$compartment %in% c("stroma") &
      cell_type_df$has_cnv_score == "CNV" ~ "cancer stroma",
    cell_type_df$compartment %in% c("fetal_nephron") &
      cell_type_df$has_cnv_score == "CNV" &
      cell_type_df$cell_type == "mesenchymal cell" ~ "blastema",
    cell_type_df$compartment %in% c("fetal_nephron") &
      cell_type_df$has_cnv_score == "CNV" &
      cell_type_df$cell_type != "mesenchymal cell" ~ "cancer epithelium",
    .default = "unknown"
  ))

cell_type_df$second.level_annotation[cell_type_df$sample_id %in% c("SCPCS000177", "SCPCS000180", "SCPCS000181", "SCPCS000190", "SCPCS000197")] <- "unknown"

Second.level_annotation of cancer and normal cells - umap reduction

ggplot(cell_type_df, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation)) +
  geom_point(size = 1, shape = 19) +
  facet_wrap(facets = ~sample_id, ncol = 5) +
  theme_bw() +
  theme(text = element_text(size = 22))

Second.level_annotation of cancer and normal cells without unknown - umap reduction

cell_type_df_sub <- cell_type_df[cell_type_df$second.level_annotation != "unknown",]
ggplot(cell_type_df_sub, aes(x = umap.umap_1, y = umap.umap_2, color = second.level_annotation)) +
  geom_point(size = 1, shape = 19) +
  facet_wrap(facets = ~sample_id, ncol = 5) +
  theme_bw() +
  theme(text = element_text(size = 22))

Per sample, the annotations in the umap reduction seem to make sense as we can the three main Wilms tumor components are easily distinguished:

  • cancer blastema
  • epithelium
  • stroma

Validation cancer versus normal based on the CNV profile

We look for each second.level_annotation and sample_id at the proportion of cells that has CNV in each of the chromosome.

These heatmaps allow us to:

  • validate that mostly tumor cells do present high percentages of cells with CNV (by definition)

  • identify chromosome and cell types that might be more sensitive to FALSE positive CNV, such as the chr6p which is a hub for immune genes

  • get the repartition per sample of the CNV

Loss on long arms

for (i in 1:22) {
  print(do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_loss_chr", i, "q")))
}

gains on long arms

for (i in 1:22) {
  print(do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_dupli_chr", i, "q")))
}

loss on short arms

for (i in c(1:12, 16:20)) {
  print(do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_loss_chr", i, "p")))
}

gains on short arms

for (i in c(1:12, 16:20)) {
  print(do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = glue::glue("has_dupli_chr", i, "p")))
}

Mean CNV profile

While the above heatmaps can be informative, we have to acknowledge that there are numerous and it is difficult to get a global picture of the CNV profile of the entire dataset.

Here we aim to plot a mean CNV profile across the 35 annotated samples as presented in Figure 1A of Cresswell et al, BioRXiv (2024). This represent the mean SNP Array profile of 64 pediatric kidney cancer (including 60 Wilms tumors).

Here we plot for each chromosome arm (x-axis) the number of samples having a dupli (y-axis > 0, in red) or a loss (y-axis < 0, in blue).

For a sample, a CNV is defined as clonal (opaque) if detected in more than 70% of the cells; and subclonal (transparent) when detected in 20 to 70% of the cells.

# Here we define the vector of ordered chromosomes arms that will be used for plotting

has_dupli_chr <- c("has_dupli_chr1p",                                      
                   "has_dupli_chr1q",

                   "has_dupli_chr2p",
                   "has_dupli_chr2q",
                   
                   "has_dupli_chr3p",
                   "has_dupli_chr3q",
                   
                   "has_dupli_chr4p",
                   "has_dupli_chr4q",
                   
                   "has_dupli_chr5p",
                   "has_dupli_chr5q",
                   
                   "has_dupli_chr6p",
                   "has_dupli_chr6q",
                   
                   "has_dupli_chr7p",
                   "has_dupli_chr7q",
                   
                   "has_dupli_chr8p",
                   "has_dupli_chr8q",
                   
                   "has_dupli_chr9p",
                   "has_dupli_chr9q",
                   
                   "has_dupli_chr10p",
                   "has_dupli_chr10q",
                   
                   "has_dupli_chr11p",
                   "has_dupli_chr11q",
                   
                   "has_dupli_chr12p",
                   "has_dupli_chr12q",
                   
                   "has_dupli_chr13p",
                   "has_dupli_chr13q",
                   
                   "has_dupli_chr14p",
                   "has_dupli_chr14q",
                   
                   "has_dupli_chr15p",
                   "has_dupli_chr15q",
                   
                   "has_dupli_chr16p",
                   "has_dupli_chr16q",
                   
                   "has_dupli_chr17p",
                   "has_dupli_chr17q",
                   
                   "has_dupli_chr18p",
                   "has_dupli_chr18q",
                   
                   "has_dupli_chr19p",
                   "has_dupli_chr19q",
                   
                   "has_dupli_chr20p",
                   "has_dupli_chr20q",
                   
                   "has_dupli_chr21p",
                   "has_dupli_chr21q",
                   
                   "has_dupli_chr22p",
                   "has_dupli_chr22q"
                   )

#################################################################################

# define the vector of ordered chromosomes arms##################################
has_loss_chr <-  c("has_loss_chr1p",                                        
                   "has_loss_chr1q",

                   "has_loss_chr2p",
                   "has_loss_chr2q",
                   
                   "has_loss_chr3p",
                   "has_loss_chr3q",
                   
                   "has_loss_chr4p",
                   "has_loss_chr4q",
                   
                   "has_loss_chr5p",
                   "has_loss_chr5q",
                   
                   "has_loss_chr6p",
                   "has_loss_chr6q",
                   
                   "has_loss_chr7p",
                   "has_loss_chr7q",
                   
                   "has_loss_chr8p",
                   "has_loss_chr8q",
                   
                   "has_loss_chr9p",
                   "has_loss_chr9q",
                   
                   "has_loss_chr10p",
                   "has_loss_chr10q",
                   
                   "has_loss_chr11p",
                   "has_loss_chr11q",
                   
                   "has_loss_chr12p",
                   "has_loss_chr12q",
                   
                   "has_loss_chr13p",
                   "has_loss_chr13q",
                   
                   "has_loss_chr14p",
                   "has_loss_chr14q",
                   
                   "has_loss_chr15p",
                   "has_loss_chr15q",
                   
                   "has_loss_chr16p",
                   "has_loss_chr16q",
                   
                   "has_loss_chr17p",
                   "has_loss_chr17q",
                   
                   "has_loss_chr18p",
                   "has_loss_chr18q",
                   
                   "has_loss_chr19p",
                   "has_loss_chr19q",
                   
                   "has_loss_chr20p",
                   "has_loss_chr20q",
                   
                   "has_loss_chr21p",
                   "has_loss_chr21q",
                   
                   "has_loss_chr22p",
                   "has_loss_chr22q"
                   )


#################################################################################
# Here we define the size of each chromosome arm that will be used for plotting of the mean CNV profile

  # Load cytoBand file
  cytoBand <- read.table(gzfile(arm_order_file), header = FALSE, sep = "\t", stringsAsFactors = FALSE)
  colnames(cytoBand) <- c("chrom", "start", "end", "band", "stain")
  
  # Extract chromosome arm information
  cytoBand <- cytoBand |>
    mutate(arm = substr(band, 1, 1))  # Extract 'p' or 'q' from the band column
  
  # Define arm boundaries
  chromosome_arms <- cytoBand |>
    group_by(chrom, arm) |>
    summarize(
      Start = min(start),
      End = max(end),
      .groups = "drop"
    ) |>
    mutate(Size = End - Start,
           dupli = paste0("has_dupli_", chrom, arm),
           loss = paste0("has_loss_", chrom, arm)) |> 
  filter(arm != "")
  
  chromosome_arms <- chromosome_arms |>
    mutate(Size = Size / sum(Size))
# prepare the table for the number of sample with a (sub)clonal loss

CNV_profile_loss <- cell_type_df_sub |>
  select(sample_id, contains("has_loss_chr")) |>
  group_by(sample_id) |>
  summarise_all(.funs = mean) |>
  pivot_longer(cols = -sample_id) |>
  rename("chromosome_arm" = "name", "percentage" = "value") |>
  left_join( chromosome_arms, by = c("chromosome_arm"="loss")) |>
  mutate(chromosome_arm = factor(chromosome_arm, levels = has_loss_chr)) |>
  mutate(absolute_70 = case_when((percentage > 0.7) ~ -1,
         .default = 0),
         absolute_10 = case_when((percentage > 0.1) ~ -1,
         .default = 0)) 


# prepare the table for the number of sample with a (sub)clonal gain

CNV_profile_gain <- cell_type_df_sub |>
  select(sample_id, contains("has_dupli_chr")) |>
  group_by(sample_id) |>
  summarise_all(.funs = mean) |>
  pivot_longer(cols = -sample_id) |>
  rename("chromosome_arm" = "name", "percentage" = "value") |>
  left_join( chromosome_arms, by = c("chromosome_arm"="dupli")) |>
  mutate(chromosome_arm = factor(chromosome_arm, levels = has_dupli_chr)) |>
  mutate(absolute_70 = case_when((percentage > 0.7) ~ 1,
         .default = 0),
         absolute_10 = case_when((percentage > 0.1) ~ 1,
         .default = 0)) 


# plot

loss_profile <- ggplot(CNV_profile_loss, aes(x = chromosome_arm, y = absolute_10, width = 20 * Size)) +
  geom_bar(stat = "identity",  fill = "#41b6c4", alpha = 0.4) +
  geom_bar(aes(x = chromosome_arm, y = absolute_70, width = 20 * Size), stat = "identity",  fill = "#41b6c4") +
  scale_x_discrete(labels = gsub("has_loss_", "", levels(droplevels(CNV_profile_loss$chromosome_arm))))  +
  theme_classic() +
  ylab("Number of occurrence") +
  xlab("chromosom arms") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 12), text = element_text(size = 15))
## Warning in geom_bar(aes(x = chromosome_arm, y = absolute_70, width = 20 * :
## Ignoring unknown aesthetics: width
gain_profile <- ggplot(CNV_profile_gain, aes(chromosome_arm, y = absolute_10, width = 20* Size)) +
  geom_bar(stat = "identity",  fill = "#ce1256", alpha = 0.4) +
  geom_bar(aes(x = chromosome_arm, y = absolute_70, width = 20 * Size), stat = "identity",  fill = "#ce1256") +
  theme_classic() +
  theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank(), 
        text = element_text(size = 15)) +
  ylab(" ") 
## Warning in geom_bar(aes(x = chromosome_arm, y = absolute_70, width = 20 * :
## Ignoring unknown aesthetics: width
# combined plot
wrap_plots(gain_profile, loss_profile, ncol = 1)

Comparing with the profile in Cresswell et al, BioRXiv (2024), we identified important CNV known to play a role in Wilms tumor, including:

  • chr8q, chr12, chr13, chr18q gains

  • chr11q, chr16, chr17q loss

However, we spotted some differences:

Our major concern is the pattern of the chr1. We do see a tendency of chr1p loss and chr1q gain but it is not as pronounced as in Cresswell et al, BioRXiv (2024). chr1q gain is however one of the most promising predictive marker of Wilms tumor outcome (Nelson et al, Curr Opin Pediatr (2020)). It is however important to notice that the profile shown in Cresswell et al, BioRXiv (2024) derived from SIOP samples that have been pre-treated with chemotherapy, while samples in the dataset are enriched in upfront resection samples that haven’t been pre-treated, according to the COG protocol. So far, we do not know how the pre-operative chemotherapy impact the clonal selection of specific CNV.

Mean of the percentage of cells with CNV in a specific chromosome arm per clinical relevant group

vital_status

As described in Nelson et al, Curr Opin Pediatr (2020), Phelps et al, Children (2018) and Zheng et al, Front. Oncol (2023), LOH at chr1p, chr11p and chr16q as well as chr1q gain are prognostic factors.

Here we aim to see if we identify CNV associated with poorer prognosis.

do_BoxPlot_mean(df = cell_type_df_sub, group.by = "vital_status", comparison = c("Alive", "Expired") )

In our study, we validate the association of chr1q gain, chr11q loss and chr14q loss with poorer prognosis.

treatment

As we don’t know the effect of the treatment on the selection of cells with CNV, it is difficult to compare our CNV profile with the one in Cresswell et al, BioRXiv (2024). We those check the repartition of CNV in samples that have been pre-treated (i.e Resection post chemotherapy) with samples that have been removed surgically before any treatment (i.e Upfront resection).

do_BoxPlot_mean(df = cell_type_df_sub, group.by = "treatment", comparison = c("Upfront resection", "Resection post chemotherapy") )

Our analysis suggest that there is indeed an enrichment in cells with CNV after chemotherapy, including a tendency of more chr1q gain samples after chemotherapy. This could explain some of the differences observed between our mean CNV profile and the one published in Cresswell et al, BioRXiv (2024).

Validation of second level annotation using marker genes

Finally, we aim to validate the second level annotation using marker genes.

Immune, PTPRC expression

do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000081237")

Immune cells are well annotated based on PTPRC expression.

Endothelium, VWF expression

do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000110799")

Endothelial cells are well annotated based on VWF expression.

Stroma

(*) To the best of our knowledges, there is no Wilms tumor specific marker of stroma cells. They should express in some extend similar markers as normal stromal cells including VIM, THY1 and COL6A3.

Vimentin expression
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000026025")

VIM do not seem to be neither specific nor universal.

COL6A3 expression
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000163359")

COL6A3 expression is higher in Wilms tumor stroma cells and expressed across a wider range of samples.

THY1 expression
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000154096")

THY1 do not seem to be neither specific nor universal.

Blastema

(*) To the best of our knowledges, there is no Wilms tumor specific and universel (i.e. expressed by all cells in every sample) marker of blastema cells. We expect however blastema cells to express higher levels of stemness markers such as CITED1, SIX1/2, NCAM1.

CITED1 expression
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000125931")

Blastema cells are enriched in CITED1+ cells, but some samples do not express it at all.

NCAM1 expression
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000149294")

NCAM1 seems to be more cancer specific and broadly expressed among samples.

SIX2 expression
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000170577")

Epithelium

CDH1 expression
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000039068")

As expected, epithelial cells express higher expression of CDH1.

PODXL expression
do_Feature_mean(cell_type_df_sub, group.by = "second.level_annotation", feature = "ENSG00000128567")

Create annotation table for export

This section creates the cell type annotation table for export.

annotations_table <- cell_type_df |>
  dplyr::select(
    cell_barcode = barcode,
    scpca_sample_id = sample_id,
    tumor_cell_classification = first.level_annotation,
    cell_type_assignment = second.level_annotation
  ) |>
  mutate(
    # change cancer --> tumor, but keep the other labels
    tumor_cell_classification = ifelse(
      tumor_cell_classification == "cancer", "tumor", tumor_cell_classification
    ),
    cell_type_assignment = str_replace_all(
      cell_type_assignment,
      "cancer ",
      "tumor "
    )
  )

write_tsv(annotations_table, annotations_tsv)

Confirm how many samples we have annotations for:

length(unique(annotations_table$scpca_sample_id))
## [1] 40

Conclusion

  • Combining label transfer and CNV inference we have produced draft annotations for 35/40 Wilms tumor samples in SCPCP000006. We decided not to annotate samples for which the infercnv couldn’t be run with a normal reference, as we don’t trust the results in that case and want to avoid misclassification (SCPCS000177, SCPCS000180, SCPCS000181, SCPCS000190, SCPCS000197).

  • The mean CNV profile of the Wilms tumor cohort seems reasonable in regards to the one published in Cresswell et al, BioRXiv (2024). We identify some commonalities but also some divergence. They can be due to technical limitation of our CNV inference but can also reflect some specificities of this cohort (e.g. pre/post treatment samples). It would be really useful to compare for some sample the infercnv result to a ground truth such as SNP Array (to be requested).

  • The heatmaps of CNV proportion and marker genes support our annotations, but signals with some marker genes are very low. Also, there is no universal marker for each entity of Wilms tumor that cover all tumor cells from all patient. This makes the validation of the annotations quite difficult.

  • However, we could try to take the problem from the other side, and used the current annotation to perform differential expression analysis and try to find marker genes that are consistent across patient and Wilms tumor histologies.

  • In each histology (i.e. epithelial and stroma), the distinction between cancer and non cancer cell is difficult (as expected). In this analysis, we suggested to rely on the CNV score to assess the normality of the cell. Here again, we could try to run differential expression analysis and compare epithelial (resp. stroma) cancer versus non-cancer cells across patient, aiming to find a share transcriptional program allowing the classification cancer versus normal.

  • In our annotation, we haven’t taken into account the favorable/anaplastic status of the sample. However, as anaplasia can occur in every (but do not has to) Wilms tumor histology, I am not sure how to integrate the information into the annotation.

  • This notebook could be finally rendered using different parameters, i.e. threshold for the CNV score and predicted score to use.

Session Info

# record the versions of the packages used in this analysis and other environment information
sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Vienna
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.33            patchwork_1.2.0    lubridate_1.9.3    forcats_1.0.0     
##  [5] stringr_1.5.1      dplyr_1.1.4        purrr_1.0.2        readr_2.1.5       
##  [9] tidyr_1.3.1        tibble_3.2.1       ggplot2_3.5.1      tidyverse_2.0.0   
## [13] Seurat_5.1.0       SeuratObject_5.0.2 sp_2.1-4          
## 
## loaded via a namespace (and not attached):
##   [1] RColorBrewer_1.1-3     rstudioapi_0.16.0      jsonlite_1.8.8        
##   [4] magrittr_2.0.3         spatstat.utils_3.1-0   farver_2.1.2          
##   [7] rmarkdown_2.28         vctrs_0.6.5            ROCR_1.0-11           
##  [10] spatstat.explore_3.3-2 htmltools_0.5.8.1      sass_0.4.9            
##  [13] sctransform_0.4.1      parallelly_1.38.0      KernSmooth_2.23-24    
##  [16] bslib_0.8.0            htmlwidgets_1.6.4      ica_1.0-3             
##  [19] plyr_1.8.9             plotly_4.10.4          zoo_1.8-12            
##  [22] cachem_1.1.0           igraph_2.0.3           mime_0.12             
##  [25] lifecycle_1.0.4        pkgconfig_2.0.3        Matrix_1.7-0          
##  [28] R6_2.5.1               fastmap_1.2.0          fitdistrplus_1.2-1    
##  [31] future_1.34.0          shiny_1.9.1            digest_0.6.37         
##  [34] colorspace_2.1-1       rprojroot_2.0.4        tensor_1.5            
##  [37] RSpectra_0.16-2        irlba_2.3.5.1          crosstalk_1.2.1       
##  [40] labeling_0.4.3         progressr_0.14.0       fansi_1.0.6           
##  [43] spatstat.sparse_3.1-0  timechange_0.3.0       httr_1.4.7            
##  [46] polyclip_1.10-7        abind_1.4-5            compiler_4.4.1        
##  [49] bit64_4.0.5            withr_3.0.1            fastDummies_1.7.4     
##  [52] highr_0.11             MASS_7.3-61            tools_4.4.1           
##  [55] lmtest_0.9-40          httpuv_1.6.15          future.apply_1.11.2   
##  [58] goftest_1.2-3          glue_1.7.0             nlme_3.1-166          
##  [61] promises_1.3.0         grid_4.4.1             Rtsne_0.17            
##  [64] cluster_2.1.6          reshape2_1.4.4         generics_0.1.3        
##  [67] gtable_0.3.5           spatstat.data_3.1-2    tzdb_0.4.0            
##  [70] data.table_1.16.0      hms_1.1.3              utf8_1.2.4            
##  [73] spatstat.geom_3.3-2    RcppAnnoy_0.0.22       ggrepel_0.9.5         
##  [76] RANN_2.6.2             pillar_1.9.0           vroom_1.6.5           
##  [79] spam_2.10-0            RcppHNSW_0.6.0         later_1.3.2           
##  [82] splines_4.4.1          lattice_0.22-6         bit_4.0.5             
##  [85] survival_3.7-0         deldir_2.0-4           tidyselect_1.2.1      
##  [88] miniUI_0.1.1.1         pbapply_1.7-2          knitr_1.48            
##  [91] gridExtra_2.3          scattermore_1.2        xfun_0.47             
##  [94] matrixStats_1.3.0      stringi_1.8.4          lazyeval_0.2.2        
##  [97] yaml_2.3.10            evaluate_0.24.0        codetools_0.2-20      
## [100] cli_3.6.3              uwot_0.2.2             xtable_1.8-4          
## [103] reticulate_1.38.0      munsell_0.5.1          jquerylib_0.1.4       
## [106] Rcpp_1.0.14            globals_0.16.3         spatstat.random_3.3-1 
## [109] png_0.1-8              spatstat.univar_3.0-0  parallel_4.4.1        
## [112] dotCall64_1.1-1        listenv_0.9.1          viridisLite_0.4.2     
## [115] scales_1.3.0           ggridges_0.5.6         crayon_1.5.3          
## [118] leiden_0.4.3.1         rlang_1.1.4            cowplot_1.1.3